Load the necessary packages.
# Load the packages used throughout the analysis.
library(corrplot)   # correlation matrix plots
library(dplyr)      # data manipulation
library(ggplot2)    # plotting
library(DescTools)  # GoodmanKruskalGamma()
library(cluster)    # pam(), clusplot()
# Read the survey responses; treat empty strings, blanks and "NA" as missing.
df <- read.csv("imputedResponses.csv",na.strings=c(""," ","NA"))
# For each column of `obj`, report whether every element is finite.
# Returns a named logical vector (one TRUE/FALSE per column). Because
# is.finite() is an internal generic, this also serves as its
# data.frame S3 method.
#
# vapply() replaces sapply() so the return type is always a logical
# vector, even for zero-column input.
is.finite.data.frame <- function(obj){
  vapply(obj, FUN = function(x) all(is.finite(x)), FUN.VALUE = logical(1))
}
# Keep only complete cases (rows with no missing values in any column).
df <- df[rowSums(is.na(df)) == 0,]
We understand that body mass index is a better measure than height and weight individually, hence we replace the individual measures with the combined measure. We have established that the height is in centimetres and the weight in kilograms.
# Body mass index from height in centimetres and weight in kilograms:
# BMI = kg / m^2.
bmi <- function(height,weight){
  height_m <- height / 100
  weight / height_m^2
}
# Derive BMI for every respondent and inspect its distribution.
df$bmi = bmi(df$Height,df$Weight)
hist(df$bmi,col = "blue",breaks = 100,xlim = c(12,60),main="Histogram For BMI",xlab='BMI')
# Since BMI is the numerical value of a scale different from other variables we recode the data to the same scale.
# 1,2,3,4,5 being underweight,fit,healthy,overweight and obese respectively.
# The sequential in-place recodes below are only safe because the codes
# (1-5) all fall below every subsequent threshold tested (18.5, 20, 25, 30),
# so already-recoded rows cannot be recoded twice.
df$bmi[df$bmi <= 18.5] = 1
df$bmi[df$bmi > 18.5 & df$bmi <= 20] = 2
df$bmi[df$bmi > 20 & df$bmi <= 25] = 3
df$bmi[df$bmi > 25 & df$bmi <= 30] = 4
df$bmi[df$bmi > 30] = 5
# Height and Weight are now redundant with bmi; drop them.
nreqAttributes <- names(df) %in% c("Height","Weight")
df <- df[!nreqAttributes]
The process of feature extraction involves analyzing the data set to find the variables that are relevant for classification. We study the personality-traits group of our data set and extract the relevant features into happinessSadness.
# Candidate happiness/sadness factors from the personality-traits
# section of the survey (all on a 1-5 response scale).
happinessFactors <- c("Hypochondria","Loneliness","Dreams","Number.of.friends","Mood.swings",
"Getting.angry","Life.struggles","Happiness.in.life","Energy.levels","Personality")
happinessSadness <- df[happinessFactors]
# We consider the above mentioned variables as the factors of happiness and sadness.From their correlation
# plot we can infer that none of the variables under study are very highly correlated.So,we use these ten
# factors across sections of our dataset to find the variables which effect these factors the most and in
# the end effect the happiness / sadness of people.
#print("Distribution of Happiness Factors")
# 2x5 grid of density histograms, one per factor.
par(mfrow=c(2,5),mar=c(2,2,2,2))
for (factorV in happinessFactors){
hist(happinessSadness[[factorV]],breaks = c(0,1,2,3,4,5),freq = FALSE,col="#3399FF",main="",mgp=c(1,0,0),xlab=factorV)
}
#mtext("Distribution of Happiness factors",side=1,line=-32,adj=1,padj=0)
We need to verify our assumption as to whether these factors are actually determinants of someone's happiness in life. We fit a generalized linear model (Poisson regression) to predict happiness in life from the other variables.
# Build the model formula "Happiness.in.life ~ <all other nine factors>"
# (column 8, Happiness.in.life, is the response and is excluded).
formulaStr <- paste(names(happinessSadness[c(1:7,9,10)]), collapse='+')
formulaStr <- paste("Happiness.in.life ~",formulaStr)
# as.formula(formulaStr)
# NOTE(review): despite the variable name, this is a Poisson GLM
# (family = "poisson"), not a logistic regression.
logitModel <- glm( as.formula(formulaStr),data = happinessSadness, family = "poisson", maxit = 100)
print(summary(logitModel))
##
## Call:
## glm(formula = as.formula(formulaStr), family = "poisson", data = happinessSadness,
## maxit = 100)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.69487 -0.24435 0.01976 0.23531 0.91807
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.979392 0.161337 6.070 1.28e-09 ***
## Hypochondria 0.010196 0.015036 0.678 0.49768
## Loneliness -0.054791 0.016840 -3.254 0.00114 **
## Dreams 0.029316 0.025125 1.167 0.24328
## Number.of.friends 0.018341 0.017892 1.025 0.30530
## Mood.swings -0.011191 0.018429 -0.607 0.54369
## Getting.angry -0.001548 0.015445 -0.100 0.92017
## Life.struggles -0.004702 0.012685 -0.371 0.71086
## Energy.levels 0.060863 0.019636 3.100 0.00194 **
## Personality 0.041341 0.028409 1.455 0.14561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 197.78 on 985 degrees of freedom
## Residual deviance: 135.55 on 976 degrees of freedom
## AIC: 3276.3
##
## Number of Fisher Scoring iterations: 4
# Bootstrap a 300-row sample (with replacement) as ad-hoc test data.
tempTestData <- df[sample(1:nrow(df),300,replace = TRUE),]
tempTestData <- tempTestData[happinessFactors]
# NOTE(review): predict() defaults to the link scale for a GLM; use
# type = "response" if response-scale predictions are intended.
# predictedV is computed here but not used later in this file.
predictedV <- predict(logitModel,tempTestData)
# polr(as.formula(formulaStr),happinessSadness)
plot(logitModel$residuals,logitModel$fitted.values,main="Plot of Fitted vs Residual values",xlab="Residuals",ylab="Fitted values")
# Function for Root Mean Squared Error
# Root mean squared error of a vector of residuals.
RMSE <- function(error) {
  sqrt(mean(error * error))
}
# RMSE of the fitted model's working residuals.
RMSE(logitModel$residuals)
## [1] 0.1947679
# If you want, say, MAE, you can do the following:
# Function for Mean Absolute Error
# Mean absolute error of a vector of residuals.
mae <- function(error) {
  mean(abs(error))
}
# MAE of the fitted model's working residuals.
mae(logitModel$residuals)
## [1] 0.1500316
We perform Principal Component Analysis on the extracted features to obtain an independent variable which represents how Happy someone really is.
# Reverse a 0-5 response scale so higher always means "happier".
modify <- function(x) {
  5 - x
}
# Positive factors keep their scale; negative factors are reversed with
# modify() so higher values consistently indicate happiness.
modifiedHappinessSadness <- data.frame(happinessSadness[,c('Dreams','Number.of.friends','Happiness.in.life','Energy.levels','Personality')], lapply(happinessSadness[,c('Hypochondria','Loneliness','Mood.swings','Getting.angry','Life.struggles')], modify) )
pca <- prcomp(modifiedHappinessSadness)
# The first principal component is used as a single "happiness" score.
pcaHappinessSadness = as.data.frame(pca$x[,1])
# NOTE(review): hard-coded to 986 rows - must equal nrow(df) after the
# complete-case filter above; confirm if the input data changes.
Happy <- vector(length = 986)
modifiedHappinessSadness = cbind(modifiedHappinessSadness,pcaHappinessSadness,Happy)
colnames(modifiedHappinessSadness)[11] <- "pcaHappinessSadness"
pcaCorrelation = cor(happinessSadness,pca$x[,1:4])
corrplot(pcaCorrelation,title="Correlation of principle components with happiness sadness factors",mar=c(0,0,2,0))
# Label a person Happy when their PC1 score is positive.
# NOTE(review): a PC1 score of exactly 0 keeps the default FALSE.
modifiedHappinessSadness$Happy[modifiedHappinessSadness$pcaHappinessSadness < 0] = "FALSE"
modifiedHappinessSadness$Happy[modifiedHappinessSadness$pcaHappinessSadness > 0] = "TRUE"
df$Happy <- modifiedHappinessSadness$Happy
# Happiness split by gender.
happinessCount <- table(df$Happy,df$Gender)
barplot(happinessCount, main = paste("Happy vs Gender"), col = c("red","blue"))
legend("topright",legend = rownames(happinessCount),fill = c("red","blue") ,ncol = 1,cex = 0.4)
This function is used for finding the contribution of the relevant attributes to the principal components.
# Helper function
# Scale one variable's loadings by the component standard deviations,
# giving that variable's coordinates on each component.
varCoordFunc <- function(loadings, comp.sdev){
  comp.sdev * loadings
}
# Compute Coordinates
loadings <- pca$rotation
sdev <- pca$sdev
varCoord <- t(apply(loadings, 1, varCoordFunc, sdev))
# Compute Cos2
# Squared coordinates measure each variable's quality of representation
# on each component.
varCos2 <- varCoord^2
# Compute contributions
# Column totals of cos2, used to normalise contributions to percentages.
compCos2 <- apply(varCos2, 2, sum)
# Percentage contribution of each variable to each component.
contrib <- function(varCos2, compCos2){
  100 * varCos2 / compCos2
}
# Contribution (%) of every variable; show the first two components.
varContrib <- t(apply(varCos2,1, contrib, compCos2))
varContrib[,1:2]
## PC1 PC2
## Dreams 1.061990 0.778612578
## Number.of.friends 6.966440 17.987340141
## Happiness.in.life 6.750761 5.479806853
## Energy.levels 9.497671 14.810048896
## Personality 2.210171 1.061324333
## Hypochondria 6.691236 0.007977579
## Loneliness 18.781307 8.316843802
## Mood.swings 14.549957 0.810755366
## Getting.angry 10.596535 10.256665877
## Life.struggles 22.893931 40.490624574
We obtain relevant attributes by observing the correlation values. We compute the correlation between these attributes by using Goodman and Kruskal’s gamma method.
findCorrelation function — this function finds the rank correlation of the attributes in the data set with respect to a predictor variable such as "Happy" or "Happiness in life", using Goodman and Kruskal's gamma method. We compare the values of each attribute with the predictor variable and keep only those attributes whose gamma value is less than -0.25 or greater than 0.25.
# Rank-correlate every column of `workingData` with `predictorVariable`
# using Goodman and Kruskal's gamma, plot the attributes whose |gamma|
# meets `threshold`, and return them.
#
# workingData       - data frame of candidate attributes.
# predictorVariable - vector to correlate each column against.
# threshold         - minimum |gamma| for an attribute to be kept
#                     (default 0.25, matching the previous behaviour).
# Returns a data frame with columns `corrVals` and `Attributes` for the
# relevant attributes; also prints a sorted bar chart of them.
findCorrelation <- function(workingData,predictorVariable,threshold = 0.25){
  # vapply replaces the old grow-a-list loop: one gamma per column,
  # always a plain numeric vector. (The original also called setNames()
  # without assigning the result - a no-op, now dropped.)
  corrVals <- vapply(workingData,
                     function(column) GoodmanKruskalGamma(column, predictorVariable),
                     numeric(1))
  # unname() keeps the data frame free of row names, as before.
  corrVals <- data.frame(corrVals = unname(corrVals))
  corrVals$Attributes = as.vector(colnames(workingData))
  corrValsModified <- corrVals
  # Order factor levels by decreasing gamma so the plot is sorted.
  corrValsModified$Attributes <- factor(corrValsModified$Attributes, levels = corrValsModified$Attributes[order(-corrValsModified$corrVals)])
  relevantTraits <- corrValsModified[(corrValsModified$corrVals >= threshold | corrValsModified$corrVals <= -threshold),]
  p <- ggplot(relevantTraits, aes(x=Attributes,y=corrVals,fill = Attributes)) + geom_bar(stat="identity") + scale_fill_hue() + coord_flip()+ggtitle("Relevant Attributes")
  print(p)
  print(relevantTraits$Attributes)
  return(relevantTraits)
}
We encode happiness value “TRUE” as 1 and happiness value “FALSE” as 0.
# Analysis of various traits wrt Happy Label.
# df$Happy currently holds the strings "TRUE"/"FALSE"; comparing them to
# logical TRUE/FALSE works because the logicals are coerced to the
# strings "TRUE"/"FALSE" before comparison.
df$Happy[df$Happy == TRUE] = 1
df$Happy[df$Happy == FALSE] = 0
df$Happy = as.numeric(df$Happy)
workingData <- df
# Exclude the ten factors already used to construct the Happy label.
nreqVariables = names(workingData) %in% happinessFactors
relevantAttributes1 <- findCorrelation(workingData = workingData[!nreqVariables],workingData$Happy)
## [1] Gender Health
## [3] Changing.the.past New.environment
## [5] Public.speaking Interests.or.hobbies
## [7] Storm Darkness
## [9] Spiders Fear.of.public.speaking
## [11] Cars Active.sport
## [13] Adrenaline.sports Romantic
## [15] Western Action
## [17] bmi Happy
## 141 Levels: Happy Gender Cars New.environment ... Darkness
# Relevant attributes w.r.t. the 5-point Happiness.in.life rating,
# excluding the derived Happy label and the response itself.
relevantAttributes2 <- findCorrelation(workingData = workingData[!(names(workingData) %in% c('Happy','Happiness.in.life'))],workingData$Happiness.in.life)
## [1] Loneliness Changing.the.past Dreams
## [4] Number.of.friends Mood.swings Energy.levels
## [7] Personality Interests.or.hobbies Fun.with.friends
## 149 Levels: Energy.levels Personality Number.of.friends ... Loneliness
library(clue)
library(factoextra)
library(caret)
library(scales)
rawData1 has the attributes whose correlation magnitude with the "Happy" variable exceeds 0.25. trainData1 has 80% of rawData1 and is used for training the models. testData1 has the remaining 20% of rawData1 and is used for prediction with the trained models. The resultsDF data frame holds the predicted values of the different clustering techniques together with the "Happy" values of the test data, for comparing accuracy.
# Preparing Data for clustering,scaling all the attributes to 1-5,omitting categorical data.
rawData1 <- na.omit(df[as.vector(relevantAttributes1$Attributes)])
rawData1$Happy <- na.omit(df$Happy)
nreqAttributes <- names(rawData1) %in% c("bmi","Gender")
rawData1 <- rawData1[!nreqAttributes]
# Scale Age weight and height.
rawData1$Age <- rescale(rawData1$Age, to = c(1, 5), from = range(rawData1$Age))
# 80/20 train/test split by row position (no shuffling).
# NOTE(review): row `nrowsTrain` appears in BOTH the training and test
# sets (the test slice starts at nrowsTrain, not nrowsTrain + 1), and
# 0.8*nrow() may be non-integer - confirm this is intended.
nrowsTrain <- 0.8*nrow(rawData1)
trainData1 <- rawData1[1:nrowsTrain,]
# NOTE(review): actualData1 is sliced from df, which may not align
# row-for-row with rawData1 after na.omit() - verify.
actualData1 <- df[nrowsTrain:nrow(df),]
testData1 <- rawData1[nrowsTrain:nrow(rawData1),]
# resultsDF <- data.frame(matrix(NA,nrow = nrow(testData1)))
# resultsDF collects the test-set Happy labels plus each model's
# predictions for later accuracy comparison.
resultsDF <- data.frame(testData1$Happy)
columnNames <- colnames(rawData1)
resultsList = list()
# Distribution of the first 15 attributes in the training data.
par(mfrow=c(2,5),mar=c(2,2,2,2))
for (factorV in columnNames[1:15]){
hist(trainData1[[factorV]],breaks = c(0,1,2,3,4,5),freq = FALSE,col="#3399FF",main="",mgp=c(1,0,0),xlab=factorV)
}
k-means clustering aims to partition the observations into k clusters in which each observation belongs to the cluster with the nearest mean. This method uses the 2-norm distance metric (Euclidean distance) for classifying the observations into clusters. We consider k=3, chosen from the elbow plot below.
#K-Means clustering for the data
# Elbow (within-cluster sum of squares) plot to choose k; the dashed
# line marks k = 3.
fviz_nbclust(trainData1, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2)
set.seed(123)
# Fit k-means with k = 3 and 25 random restarts.
km.res <- kmeans(trainData1, 3, nstart = 25,iter.max = 10000)
km.res$centers
## Health Changing.the.past New.environment Public.speaking
## 1 3.026756 2.886288 3.889632 3.076923
## 2 3.290456 3.016598 3.128631 3.680498
## 3 3.576613 3.104839 3.334677 4.028226
## Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking
## 1 4.003344 1.525084 1.625418 2.170569 2.367893
## 2 3.078838 1.709544 1.941909 2.278008 2.879668
## 3 3.443548 2.919355 3.366935 4.258065 3.185484
## Cars Active.sport Adrenaline.sports Romantic Western Action
## 1 3.541806 4.280936 4.036789 3.063545 2.581940 4.113712
## 2 2.033195 2.070539 1.917012 3.302905 1.933610 3.215768
## 3 2.225806 3.334677 2.737903 4.201613 1.693548 3.116935
## Happy
## 1 0.7123746
## 2 0.3941909
## 3 0.3306452
fviz_cluster(km.res, data = trainData1,main="K-means cluster plot wrt Happy factor with k=3")
# Attach the assigned cluster number to each training row.
kmeansCluster <- trainData1
kmeansCluster$clusterNo <- km.res$cluster
clusplot(kmeansCluster[c("Adrenaline.sports","Active.sport","Health")],kmeansCluster$clusterNo,main="Effect of active engagement in activities on happiness")
clusplot(kmeansCluster[c("Changing.the.past","Spiders","Storm","Darkness")],kmeansCluster$clusterNo,main = "Negative Impactors of Happy Clustered together")
We plot the clusters obtained after clustering.
library(gridExtra)
# For every attribute, plot the per-cluster response counts.
for(factorV in colnames(trainData1)){
tempdf <- kmeansCluster %>%
group_by(clusterNo,!!sym(factorV)) %>%
summarise(counts = n())
p <- ggplot(tempdf, aes(fill=!!sym(factorV), y = counts,x=clusterNo)) +
geom_bar(position="dodge", stat="identity",width = 0.3)+
labs(title=paste("Cluster Analysis for",factorV), x="Cluster No", y="No of people belonging to the cluster") +
theme(plot.title = element_text(size=16))
print(p)
}
# grid.arrange(p,nrow = 3,ncol = 5)
We predict the clusters for test data based on the k-means model trained and then compare the accuracy of the prediction by comparing with actual data.
# Assign each test row to its nearest k-means centre.
predictedCluster <- as.vector(cl_predict(km.res,testData1))
resultsDF$PredictedKmean = predictedCluster
# Map clusters to Happy labels using the centres' mean Happy values
# printed above (cluster 1 was majority-happy; clusters 2 and 3 not).
resultsDF$PredictedKmean[resultsDF$PredictedKmean == 3| resultsDF$PredictedKmean == 2] = 0
resultsDF$PredictedKmean[resultsDF$PredictedKmean == 1] = 1
resultsList$Kmeans <- confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedKmean,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$Kmeans)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 20
## 1 43 65
##
## Accuracy : 0.6818
## 95% CI : (0.612, 0.746)
## No Information Rate : 0.5707
## P-Value [Acc > NIR] : 0.0008779
##
## Kappa : 0.3717
## Mcnemar's Test P-Value : 0.0055758
##
## Precision : 0.6019
## Recall : 0.7647
## F1 : 0.6736
## Prevalence : 0.4293
## Detection Rate : 0.3283
## Detection Prevalence : 0.5455
## Balanced Accuracy : 0.6921
##
## 'Positive' Class : 1
##
# Second experiment: cluster on the attributes relevant to the 5-point
# Happiness.in.life rating rather than the binary Happy label.
rawData2 <- na.omit(df[as.vector(relevantAttributes2$Attributes)])
rawData2$Happiness.in.life <- na.omit(df$Happiness.in.life)
# Binarise the rating: 1-3 -> 0, 4-5 -> 1.
rawData2$happinessModified = rawData2$Happiness.in.life
rawData2$happinessModified[rawData2$happinessModified<=3] = 0
rawData2$happinessModified[rawData2$happinessModified>3] = 1
#rawData2[-c(10)]
# 90/10 train/test split (same boundary-row caveat as the first split).
nrowsTrain <- 0.9*nrow(rawData2)
trainData2 <- rawData2[1:nrowsTrain,]
actualData <- df[nrowsTrain:nrow(df),]
testData2 <- rawData2[nrowsTrain:nrow(rawData2),]
fviz_nbclust(trainData2, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2)
set.seed(123)
km.res <- kmeans(trainData2, 3, nstart = 25)
tempCenters <- as.data.frame(km.res$centers)
print(tempCenters)
## Loneliness Changing.the.past Dreams Number.of.friends Mood.swings
## 1 3.529210 3.333333 3.130584 2.587629 3.611684
## 2 3.049808 3.938697 3.279693 3.739464 3.532567
## 3 2.164179 1.874627 3.489552 3.737313 2.737313
## Energy.levels Personality Interests.or.hobbies Fun.with.friends
## 1 2.786942 2.996564 2.597938 4.219931
## 2 4.061303 3.394636 4.049808 4.727969
## 3 4.125373 3.483582 4.026866 4.704478
## Happiness.in.life happinessModified
## 1 3.151203 0.3608247
## 2 3.685824 0.6398467
## 3 4.194030 0.9014925
fviz_cluster(km.res, data = trainData2,main="K-means cluster plot wrt Happiness in life with k=3")
predictedCluster <- as.vector(cl_predict(km.res,testData2))
testData2$PredictedKmean2 = predictedCluster
# NOTE(review): predictions are stored in PredictedKmean2, but the
# recodes and confusion matrix below operate on PredictedKmean - a
# different (at this point undefined) column - and the recode targets
# (5, 7) do not match the k = 3 cluster numbering. This looks like a
# leftover from an earlier run with a larger k; verify before trusting
# the confusion matrix below.
testData2$PredictedKmean[testData2$PredictedKmean == 5] = 2
testData2$PredictedKmean[testData2$PredictedKmean == 3 ] = 3
testData2$PredictedKmean[testData2$PredictedKmean == 4 | testData2$PredictedKmean == 1| testData2$PredictedKmean == 7|testData2$PredictedKmean == 2 ] = 5
confusionMatrix(factor(actualData$Happiness.in.life,levels = 1:5),factor(testData2$PredictedKmean,levels = 1:5),mode="prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 0 0 0 0 0
## 2 0 0 0 0 4
## 3 0 0 4 0 28
## 4 0 0 24 0 26
## 5 0 0 11 0 2
##
## Overall Statistics
##
## Accuracy : 0.0606
## 95% CI : (0.0226, 0.1273)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.1845
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Precision NA 0.0000 0.1250 0.0000 0.15385
## Recall NA NA 0.1026 NA 0.03333
## F1 NA NA 0.1127 NA 0.05479
## Prevalence 0 0.0000 0.3939 0.0000 0.60606
## Detection Rate 0 0.0000 0.0404 0.0000 0.02020
## Detection Prevalence 0 0.0404 0.3232 0.5051 0.13131
## Balanced Accuracy NA NA 0.3179 NA 0.37564
# Recode the predicted cluster numbers to binary happiness labels.
testData2$PredictedKmean2[testData2$PredictedKmean2 == 1 | testData2$PredictedKmean2 == 2 ] = 0
testData2$PredictedKmean2[testData2$PredictedKmean2 == 3] = 1
# NOTE(review): this confusion matrix still uses PredictedKmean (not the
# recoded PredictedKmean2), so it repeats the previous result verbatim.
confusionMatrix(factor(actualData$Happiness.in.life,levels = 1:5),factor(testData2$PredictedKmean,levels = 1:5),mode="prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 0 0 0 0 0
## 2 0 0 0 0 4
## 3 0 0 4 0 28
## 4 0 0 24 0 26
## 5 0 0 11 0 2
##
## Overall Statistics
##
## Accuracy : 0.0606
## 95% CI : (0.0226, 0.1273)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.1845
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Precision NA 0.0000 0.1250 0.0000 0.15385
## Recall NA NA 0.1026 NA 0.03333
## F1 NA NA 0.1127 NA 0.05479
## Prevalence 0 0.0000 0.3939 0.0000 0.60606
## Detection Rate 0 0.0000 0.0404 0.0000 0.02020
## Detection Prevalence 0 0.0404 0.3232 0.5051 0.13131
## Balanced Accuracy NA NA 0.3179 NA 0.37564
Hierarchical clustering is an approach for identifying clustering in the complete rawData1 dataset by using pairwise distance matrix between observations as clustering criteria. We perform hierarchical clustering using k=2 i.e classifying into 2 clusters. We have used “complete” method of hierarchical clustering.
#Hierarchical clustering for all the relevant attributes
##For k=2
library(dendextend)
library(colorspace)
# Pairwise (Euclidean) distance matrix over the full rawData1 set.
distCalc <- dist(rawData1)
#Using the "complete" method for clustering as it gives better accuracy
hclusters <- hclust(distCalc, method = "complete")
happyLevels <- rev(levels(as.factor(rawData1$Happy)))
#Forming the dendrogram
dend <- as.dendrogram(hclusters)
# NOTE(review): rotate() is given 1:150 although the tree has 986
# leaves - confirm this truncated order is intended.
dend <- rotate(dend, 1:150)
dend <- color_branches(dend, k=2)
# Colour each leaf label by the observation's Happy value.
labels_colors(dend) <-
rainbow_hcl(2)[sort_levels_values(
as.numeric(rawData1$Happy)[order.dendrogram(dend)]
)]
labels(dend) <- paste(as.character(rawData1$Happy)[order.dendrogram(dend)],"(",labels(dend),")", sep = "")
dend <- hang.dendrogram(dend,hang_height=0.1)
dend <- set(dend, "labels_cex", 0.5)
#Plotting the dendrogram
par(mfrow=c(1,1))
plot(dend,
main = "Hierarchical clustering of relevant attributes",
horiz = TRUE, nodePar = list(cex = .007))
legend("topleft", legend = happyLevels, fill = rainbow_hcl(2))
# Build a dendlist (one dendrogram per clustering method; currently
# only "complete" is used).
hclusteringMethod <- c("complete")
hclustDendlist <- dendlist()
for(i in seq_along(hclusteringMethod)) {
hCluster <- hclust(distCalc , method = hclusteringMethod[i])
hclustDendlist <- dendlist(hclustDendlist, as.dendrogram(hCluster))
}
hclustDendlist
## [[1]]
## 'dendrogram' with 2 branches and 986 members total, at height 13.56466
##
## attr(,"class")
## [1] "dendlist"
# Cut a dendrogram into two clusters and return the assignments in
# dendrogram (leaf) order.
getClusters <- function(dend) {
  leaf_order <- order.dendrogram(dend)
  cutree(dend, k = 2)[leaf_order]
}
# One column of cluster assignments per method in the dendlist.
dendClusters <- lapply(hclustDendlist, getClusters)
dendClusters<-as.data.frame(dendClusters)
colnames(dendClusters)[1]<-"predicted.clusters"
# Recode cluster numbers to Happy labels: cluster 1 -> 0 (not happy),
# cluster 2 -> 1 (happy). A single vectorised ifelse() replaces the
# former row-by-row loop; the resulting values are identical.
modifiedDendClusters<-dendClusters
modifiedDendClusters$predicted.clusters <- ifelse(
  modifiedDendClusters$predicted.clusters == 1, 0, 1
)
#For the confusion Matrix
# In-sample agreement between the 2-cluster cut and the Happy label.
referenceData<-rawData1$Happy
predictedData<-modifiedDendClusters$predicted.clusters
# union() gives both factors an identical level set.
unionData <- union(predictedData,referenceData)
tableData <- table(factor(predictedData, unionData), factor(referenceData, unionData))
confusionMatrix(tableData,positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 215 199
## 1 273 299
##
## Accuracy : 0.5213
## 95% CI : (0.4896, 0.5529)
## No Information Rate : 0.5051
## P-Value [Acc > NIR] : 0.1617510
##
## Kappa : 0.041
## Mcnemar's Test P-Value : 0.0007792
##
## Precision : 0.5227
## Recall : 0.6004
## F1 : 0.5589
## Prevalence : 0.5051
## Detection Rate : 0.3032
## Detection Prevalence : 0.5801
## Balanced Accuracy : 0.5205
##
## 'Positive' Class : 1
##
Here also we classify into 2 clusters by taking k=2.
#Hierarchical clustering and prediction using test data
library(dendextend)
library(colorspace)
# Pairwise distances over the test split only.
distCalcTest <- dist(testData1)
#Using the "ward.D" method for clustering as it gives better accuracy
hclustersTest <- hclust(distCalcTest, method = "ward.D")
happyLevelsTest <- rev(levels(as.factor(testData1$Happy)))
#Forming the dendrogram
dendTest <- as.dendrogram(hclustersTest)
dendTest <- rotate(dendTest, 1:150)
dendTest <- color_branches(dendTest, k=2)
# Colour each leaf label by the observation's Happy value.
labels_colors(dendTest) <-
rainbow_hcl(2)[sort_levels_values(
as.numeric(testData1$Happy)[order.dendrogram(dendTest)]
)]
# BUG FIX: this previously assigned to labels(dend) - the TRAINING
# dendrogram from the earlier section - instead of labels(dendTest).
labels(dendTest) <- paste(as.character(testData1$Happy)[order.dendrogram(dendTest)],"(",labels(dendTest),")", sep = "")
dendTest <- hang.dendrogram(dendTest,hang_height=0.1)
dendTest <- set(dendTest, "labels_cex", 0.5)
#Plotting the dendrogram
par(mfrow=c(1,1))
plot(dendTest,
main = "Hierarchical Clustering on Test Data",
horiz = TRUE, nodePar = list(cex = .007))
legend("topleft", legend = happyLevelsTest, fill = rainbow_hcl(2))
We use the “ward.D” method of hierarchical clustering.
# Build the test-set dendlist ("ward.D" only).
hclusteringMethodTest <- c("ward.D")
hclustDendlistTest <- dendlist()
for(i in seq_along(hclusteringMethodTest)) {
hClusterTest <- hclust(distCalcTest , method = hclusteringMethodTest[i])
hclustDendlistTest <- dendlist(hclustDendlistTest, as.dendrogram(hClusterTest))
}
#hclustDendlistTest
# Two-cluster cut of a test dendrogram, returned in leaf order.
getClustersTest <- function(dendTest) {
  ord <- order.dendrogram(dendTest)
  cutree(dendTest, k = 2)[ord]
}
# Cluster assignments for the test dendrogram(s), one column per method.
dendClustersTest <- lapply(hclustDendlistTest, getClustersTest)
dendClustersTest<-as.data.frame(dendClustersTest)
colnames(dendClustersTest)[1]<-"predictedHierarchical"
# Recode cluster numbers to Happy labels: cluster 1 -> 0, cluster 2 -> 1.
# Vectorised ifelse() replaces the former row-by-row loop (identical
# result).
modifiedDendClustersTest<-dendClustersTest
modifiedDendClustersTest$predictedHierarchical <- ifelse(
  modifiedDendClustersTest$predictedHierarchical == 1, 0, 1
)
# Keep the hierarchical predictions alongside the other models' results.
resultsDF<-cbind(resultsDF,modifiedDendClustersTest)
We compare the accuracy of the predicted and actual values obtained by hierarchical clustering.
#For the confusion Matrix
referenceDataTest<-testData1$Happy
predictedDataHierarchicalTest<-resultsDF$predictedHierarchical
# Shared level set so both factors have identical levels.
unionDataHierarchicalTest <- union(predictedDataHierarchicalTest,referenceDataTest)
tableDataHierarchicalTest <- table(factor(predictedDataHierarchicalTest, unionDataHierarchicalTest), factor(referenceDataTest, unionDataHierarchicalTest))
resultsList$Hierarchical <- confusionMatrix(tableDataHierarchicalTest,positive = '1',mode="prec_recall")
print(resultsList$Hierarchical)
## Confusion Matrix and Statistics
##
##
## 1 0
## 1 68 33
## 0 40 57
##
## Accuracy : 0.6313
## 95% CI : (0.56, 0.6986)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 0.008903
##
## Kappa : 0.2613
## Mcnemar's Test P-Value : 0.482525
##
## Precision : 0.6733
## Recall : 0.6296
## F1 : 0.6507
## Prevalence : 0.5455
## Detection Rate : 0.3434
## Detection Prevalence : 0.5101
## Balanced Accuracy : 0.6315
##
## 'Positive' Class : 1
##
k-medoid clustering is a partitional algorithm related to the k-means and medoid-shifting algorithms; it uses medoids, i.e., actual points in the data set, as cluster centres. We have partitioned into six clusters.
#k-medoid Clustering
library(cluster)
set.seed(123)
# Partition the training data around 6 medoids (PAM).
pam.res <- pam(trainData1, 6)
pam.res$medoids
## Health Changing.the.past New.environment Public.speaking
## 570 3 3 4 2
## 662 3 3 4 4
## 765 4 4 2 5
## 600 3 3 3 3
## 30 4 3 3 5
## 196 3 4 3 3
## Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking
## 570 4 2 3 2 2
## 662 3 2 2 2 3
## 765 3 2 2 5 4
## 600 4 1 2 2 3
## 30 4 3 4 5 3
## 196 3 2 3 5 3
## Cars Active.sport Adrenaline.sports Romantic Western Action Happy
## 570 2 5 3 5 2 3 1
## 662 3 2 2 3 1 3 1
## 765 1 2 1 4 1 3 0
## 600 4 4 4 3 3 5 1
## 30 2 4 3 5 2 2 0
## 196 4 4 3 4 2 4 0
fviz_cluster(pam.res, data = trainData1,main="K-Medoid Clustering with 6 clusters")
predictedClusterPam <- as.vector(cl_predict(pam.res,testData1))
resultsDF$PredictedPam = predictedClusterPam
# Map medoid clusters to Happy labels using each medoid's Happy value
# printed above (medoids 3, 5, 6 have Happy = 0; medoids 1, 2, 4 have
# Happy = 1).
resultsDF$PredictedPam[resultsDF$PredictedPam == 3 |resultsDF$PredictedPam == 5 | resultsDF$PredictedPam == 6] = 0
resultsDF$PredictedPam[resultsDF$PredictedPam == 1| resultsDF$PredictedPam == 2 | resultsDF$PredictedPam == 4] = 1
resultsList$KMedoid = confusionMatrix(factor(resultsDF$PredictedPam,levels = 0:1),factor(testData1$Happy,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$KMedoid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 19
## 1 45 89
##
## Accuracy : 0.6768
## 95% CI : (0.6068, 0.7413)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 0.0001136
##
## Kappa : 0.3321
## Mcnemar's Test P-Value : 0.0017781
##
## Precision : 0.6642
## Recall : 0.8241
## F1 : 0.7355
## Prevalence : 0.5455
## Detection Rate : 0.4495
## Detection Prevalence : 0.6768
## Balanced Accuracy : 0.6620
##
## 'Positive' Class : 1
##
k-medians clustering partitions the observations into k clusters in which each observation belongs to the cluster with the nearest median. This method uses 1-norm distance metric(Manhattan distance) for classifying the observations into clusters. We consider k=4 and perform k-medians clustering on the Training data.This model is used to predict the clusters for Test data.
library(flexclust)
set.seed(12)
#Considering k=4
#To iterate and train the model 15 times
# NOTE(review): each iteration overwrites kMedian, so only the 15th fit
# is kept - the loop neither aggregates nor selects the best model.
for(i in 1:15)
{
kMedian = kcca(trainData1, k=4, kccaFamily("kmedians"),save.data = TRUE)
}
kMedianValues = parameters(kMedian)
#To print the median value of each cluster
print(kMedianValues)
## Health Changing.the.past New.environment Public.speaking
## [1,] 3 3 4 4
## [2,] 3 3 3 4
## [3,] 3 3 4 2
## [4,] 4 3 3 4
## Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking
## [1,] 4 1 1 2 3
## [2,] 3 1 2 2 3
## [3,] 4 1 2 3 2
## [4,] 3 3 3 5 3
## Cars Active.sport Adrenaline.sports Romantic Western Action Happy
## [1,] 3 5 4 3 3 4 1
## [2,] 2 2 1 3 1 3 0
## [3,] 4 4 4 4 2 4 1
## [4,] 2 3 3 4 1 3 0
kMedianTrainClusters<-clusters(kMedian)
#Plotting the clusters
clusplot(trainData1,kMedianTrainClusters,main = paste("CLUSPLOT For K Medians(k=4)"))
# Predict the nearest-median cluster for each test observation.
predClusterMedian<- predict(kMedian, newdata=testData1, k=4, kccaFamily("kmedians"))
From the cluster medians printed above, we observe that clusters 2 and 4 have a Happy median of 0 and clusters 1 and 3 have a Happy median of 1. Therefore we recode the cluster numbers as the corresponding median values in order to compare with the actual Happy values.
resultsDF$predictedKMedian = predClusterMedian
# Per the printed kMedianValues, clusters 2 and 4 have Happy median 0,
# and clusters 1 and 3 have Happy median 1; recode accordingly.
resultsDF$predictedKMedian[resultsDF$predictedKMedian == 2| resultsDF$predictedKMedian == 4] = 0
resultsDF$predictedKMedian[resultsDF$predictedKMedian == 1| resultsDF$predictedKMedian == 3] = 1
#Computing a confusion matrix of the predicted and actual data
resultsList$KMedian <- confusionMatrix(factor(resultsDF$predictedKMedian,levels = 0:1),factor(testData1$Happy ,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$KMedian)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 62 34
## 1 28 74
##
## Accuracy : 0.6869
## 95% CI : (0.6173, 0.7507)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 3.407e-05
##
## Kappa : 0.372
## Mcnemar's Test P-Value : 0.5254
##
## Precision : 0.7255
## Recall : 0.6852
## F1 : 0.7048
## Prevalence : 0.5455
## Detection Rate : 0.3737
## Detection Prevalence : 0.5152
## Balanced Accuracy : 0.6870
##
## 'Positive' Class : 1
##
Fuzzy c-means(soft clustering or soft k-means) is a clustering technique which allows each observation to belong to more than one cluster. Each observation is assigned a membership grade for each cluster it belongs to. We partition into 3 clusters.
# set.seed(1)
library(cluster)
library(e1071)
# Fuzzy c-means: 3 clusters, Manhattan distance, fuzzifier m = 2.
cmeansCluster <- cmeans(trainData1,centers = 3 ,iter.max = 100, verbose = FALSE,dist = "manhattan", method = "cmeans", m = 2,rate.par = NULL, weights = 1, control = list())
print(cmeansCluster$centers)
## Health Changing.the.past New.environment Public.speaking
## 1 3 3 4 4
## 2 3 3 4 3
## 3 3 3 3 4
## Interests.or.hobbies Storm Darkness Spiders Fear.of.public.speaking Cars
## 1 4 1 2 2 3 3
## 2 4 2 2 2 3 3
## 3 3 2 2 3 3 2
## Active.sport Adrenaline.sports Romantic Western Action Happy
## 1 4 3 3 2 4 1
## 2 4 3 3 2 4 1
## 3 3 3 4 2 3 0
clusplot(trainData1,cmeansCluster$cluster,main="CLUSPLOT for Fuzzy C-Means with 3 clusters")
resultsDF$PredictedCmeans <- cl_predict(cmeansCluster,testData1,type = "class_ids")
# Per the printed centers, clusters 1 and 2 have Happy = 1 and cluster 3
# has Happy = 0; recode the cluster ids accordingly.
resultsDF$PredictedCmeans[resultsDF$PredictedCmeans == 3] = 0
resultsDF$PredictedCmeans[resultsDF$PredictedCmeans == 1| resultsDF$PredictedCmeans == 2] = 1
resultsList$Cmeans <- confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedCmeans,levels = 0:1),positive = '1',mode="prec_recall")
print(resultsList$Cmeans)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 56 34
## 1 21 87
##
## Accuracy : 0.7222
## 95% CI : (0.6543, 0.7834)
## No Information Rate : 0.6111
## P-Value [Acc > NIR] : 0.0006932
##
## Kappa : 0.433
## Mcnemar's Test P-Value : 0.1056454
##
## Precision : 0.8056
## Recall : 0.7190
## F1 : 0.7598
## Prevalence : 0.6111
## Detection Rate : 0.4394
## Detection Prevalence : 0.5455
## Balanced Accuracy : 0.7231
##
## 'Positive' Class : 1
##
Decision Tree creates a model that predicts the value of the target variable by learning simple decision rules inferred from the features. We have implemented the CART and C5.0 (an extension of the ID3 algorithm) algorithms for decision trees.
#Decision Tree using CART method
set.seed(1)
library(rpart)
library(rpart.plot)
# CART classification tree on all relevant attributes.
fit <- rpart(Happy~., data = trainData1, method = 'class')
rpart.plot(fit, extra = 101,main="Decision Tree using CART method")
predictedVal <-predict(fit, testData1, type = 'class')
tableMat <- table(testData1$Happy, predictedVal)
# NOTE(review): this confusion matrix is stored but never printed.
resultsList$DecTreeCart <- confusionMatrix(tableMat,positive = '1',mode="prec_recall")
C5.0 is an extension of ID3 algorithm to generate a decision tree and uses normalized information gain as the criteria to split the samples into a class.
#Decision Tree using C5.0
library(C50)
set.seed(123)
# C5.0 tree on a hand-picked subset of attributes (training-data
# columns 1, 2, 5, 9, 13).
formulaStrDt <- paste(names(trainData1[c(1:2,5,9,13)]), collapse='+')
formulaStrDt <- paste("factor(Happy) ~",formulaStrDt)
model <- C5.0(as.formula(formulaStrDt),data=trainData1)
plot(model,main="Decision Tree for C5.0 method")
resultsDF$predictedDTC5 = predict(model,testData1)
resultsList$DecTreeC50 <- confusionMatrix(factor(resultsDF$predictedDTC5),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$DecTreeC50)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 63 33
## 1 27 75
##
## Accuracy : 0.697
## 95% CI : (0.6278, 0.7601)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 9.344e-06
##
## Kappa : 0.3923
## Mcnemar's Test P-Value : 0.5186
##
## Precision : 0.7353
## Recall : 0.6944
## F1 : 0.7143
## Prevalence : 0.5455
## Detection Rate : 0.3788
## Detection Prevalence : 0.5152
## Balanced Accuracy : 0.6972
##
## 'Positive' Class : 1
##
Random Forest is an ensemble classification method that generates multiple decision trees and outputs the mode of the classes.
#Decision tree using Random Forest
library(randomForest)
set.seed(1)
# Fit a forest of 750 trees on all remaining predictors of Happy.
rf = randomForest(factor(Happy)~.,data = trainData1,ntree=750)
# Error as a function of the number of trees grown.
plot(rf,main="Error plot for Random Forest")
# Relative importance of each predictor in the fitted forest.
varImpPlot(rf,main="Importance of variables")
# Evaluate on the held-out test data and store the precision/recall summary.
resultsDF$predictedRf = predict(rf,testData1)
resultsList$DecTreeRF <- confusionMatrix(factor(resultsDF$predictedRf),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$DecTreeRF)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 64 27
## 1 26 81
##
## Accuracy : 0.7323
## 95% CI : (0.6649, 0.7926)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 4.834e-08
##
## Kappa : 0.4607
## Mcnemar's Test P-Value : 1
##
## Precision : 0.7570
## Recall : 0.7500
## F1 : 0.7535
## Prevalence : 0.5455
## Detection Rate : 0.4091
## Detection Prevalence : 0.5404
## Balanced Accuracy : 0.7306
##
## 'Positive' Class : 1
##
Support Vector Machine (SVM) is a supervised machine learning algorithm which plots the data points in an n-dimensional space, where n is the number of features and the value of a particular coordinate is the value of the corresponding feature.
#SVM linear
# Linear-kernel SVM classifier for Happy on all remaining predictors.
# NOTE(review): svm() comes from the e1071 package, whose library() call is
# not visible in this chunk — presumably loaded earlier; confirm.
svmLinearModel = svm(Happy~.,data = trainData1,type="C-classification",kernel="linear")
resultsDF$predictedSvmLinear = predict(svmLinearModel,testData1)
resultsList$svmLin <- confusionMatrix(factor(resultsDF$predictedSvmLinear),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$svmLin)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 60 28
## 1 30 80
##
## Accuracy : 0.7071
## 95% CI : (0.6384, 0.7694)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 2.338e-06
##
## Kappa : 0.4082
## Mcnemar's Test P-Value : 0.8955
##
## Precision : 0.7273
## Recall : 0.7407
## F1 : 0.7339
## Prevalence : 0.5455
## Detection Rate : 0.4040
## Detection Prevalence : 0.5556
## Balanced Accuracy : 0.7037
##
## 'Positive' Class : 1
##
# Decision boundary of the linear SVM projected onto two features.
plot(svmLinearModel,data=trainData1,Changing.the.past ~ Interests.or.hobbies)
#SVM polynomial
# Polynomial-kernel SVM; same response and predictors as the linear model.
svmPolyModel = svm(Happy~.,data = trainData1,type="C-classification",kernel="polynomial")
resultsDF$predictedSvmPoly = predict(svmPolyModel,testData1)
resultsList$svmPoly <- confusionMatrix(factor(resultsDF$predictedSvmPoly),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$svmPoly)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 55 23
## 1 35 85
##
## Accuracy : 0.7071
## 95% CI : (0.6384, 0.7694)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 2.338e-06
##
## Kappa : 0.4026
## Mcnemar's Test P-Value : 0.1486
##
## Precision : 0.7083
## Recall : 0.7870
## F1 : 0.7456
## Prevalence : 0.5455
## Detection Rate : 0.4293
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.6991
##
## 'Positive' Class : 1
##
# Decision boundary of the polynomial SVM projected onto two features.
plot(svmPolyModel,data=trainData1,Changing.the.past ~ Interests.or.hobbies)
#SVM radial
# Radial-basis-kernel SVM; same response and predictors as the linear model.
svmRadialModel = svm(Happy~.,data = trainData1,type="C-classification",kernel="radial")
resultsDF$predictedSvmRadial = predict(svmRadialModel,testData1)
resultsList$svmRadial <- confusionMatrix(factor(resultsDF$predictedSvmRadial),factor(testData1$Happy),positive = '1',mode="prec_recall")
print(resultsList$svmRadial)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 65 30
## 1 25 78
##
## Accuracy : 0.7222
## 95% CI : (0.6543, 0.7834)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 2.453e-07
##
## Kappa : 0.4424
## Mcnemar's Test P-Value : 0.5896
##
## Precision : 0.7573
## Recall : 0.7222
## F1 : 0.7393
## Prevalence : 0.5455
## Detection Rate : 0.3939
## Detection Prevalence : 0.5202
## Balanced Accuracy : 0.7222
##
## 'Positive' Class : 1
##
# Decision boundary of the radial SVM projected onto two features.
# FIX: this chunk evaluates the radial model, but the original line plotted
# svmPolyModel a second time (copy-paste error).
plot(svmRadialModel,data=trainData1,Changing.the.past ~ Interests.or.hobbies)
# Print the accuracy and F1 score of every model collected in resultsList.
# FIX: iterate with seq_along() instead of the hard-coded c(1:11) so the
# summary stays correct when models are added to or removed from the list.
resultListNames <- names(resultsList)
for (i in seq_along(resultsList)) {
  print(resultListNames[i])
  print(resultsList[[i]]$overall["Accuracy"])
  print(resultsList[[i]]$byClass["F1"])
}
## [1] "Kmeans"
## Accuracy
## 0.6818182
## F1
## 0.6735751
## [1] "Hierarchical"
## Accuracy
## 0.6313131
## F1
## 0.6507177
## [1] "KMedoid"
## Accuracy
## 0.6767677
## F1
## 0.7355372
## [1] "KMedian"
## Accuracy
## 0.6868687
## F1
## 0.7047619
## [1] "Cmeans"
## Accuracy
## 0.7222222
## F1
## 0.7598253
## [1] "DecTreeCart"
## Accuracy
## 0.6616162
## F1
## 0.685446
## [1] "DecTreeC50"
## Accuracy
## 0.6969697
## F1
## 0.7142857
## [1] "DecTreeRF"
## Accuracy
## 0.7323232
## F1
## 0.7534884
## [1] "svmLin"
## Accuracy
## 0.7070707
## F1
## 0.733945
## [1] "svmPoly"
## Accuracy
## 0.7070707
## F1
## 0.745614
## [1] "svmRadial"
## Accuracy
## 0.7222222
## F1
## 0.7393365
Majority vote: it is defined as taking the prediction with the maximum number of votes / recommendations across multiple models' predictions while predicting the outcomes of a classification problem.
determineMajority <- function(resDF, n) {
  # Majority vote over the 0/1 model predictions in resDF: a row is voted 1
  # when at least n of the model columns predicted 1, and 0 otherwise.
  # Returns a numeric 0/1 vector with one entry per row of resDF.
  #
  # FIX: the original loop assigned each vote into resDF$PredictedVote while
  # still iterating. The first assignment created the column by recycling its
  # value down every row, so from the second row onward rowSums(resDF[i,])
  # counted row 1's vote as if it were an extra model, biasing the count.
  # Summing only the input columns, once, removes that contamination and
  # drops the per-row loop entirely.
  as.numeric(rowSums(resDF) >= n)
}
# Coerce every stored prediction (factor output from the classifiers) to
# numeric 0/1 so that row sums and weighted means can be taken across models.
resultsDF[] <- lapply(resultsDF, function(x) as.numeric(as.character(x)))
resultsDF <- as.data.frame(resultsDF)
# Placeholder column; overwritten by the majority-vote result two lines down.
resultsDF$PredictedVote = 1:nrow(resultsDF)
# Majority vote over columns 2-5 (presumably the four clustering models —
# confirm column order), requiring at least 3 of 4 votes.
resultsDF$PredictedClusterVote = determineMajority(resultsDF[c(2,3:5)],3)
# Majority vote over columns 2, 3, 5, 8 and 11, requiring at least 3 of 5.
resultsDF$PredictedVote = determineMajority(resultsDF[c(2,3,5,8,11)],3)
# NOTE(review): caret's confusionMatrix() expects the predictions as its
# first argument and the reference labels second; here the actual labels are
# passed first, so "Prediction"/"Reference" (and hence precision vs recall)
# are swapped relative to their usual meaning — confirm this is intentional.
confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedClusterVote,levels = 0:1),positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 56 34
## 1 19 89
##
## Accuracy : 0.7323
## 95% CI : (0.6649, 0.7926)
## No Information Rate : 0.6212
## P-Value [Acc > NIR] : 0.0006429
##
## Kappa : 0.4526
## Mcnemar's Test P-Value : 0.0544740
##
## Precision : 0.8241
## Recall : 0.7236
## F1 : 0.7706
## Prevalence : 0.6212
## Detection Rate : 0.4495
## Detection Prevalence : 0.5455
## Balanced Accuracy : 0.7351
##
## 'Positive' Class : 1
##
# NOTE(review): the actual labels are passed in confusionMatrix()'s first
# (prediction) slot and the votes in the second (reference) slot, the reverse
# of caret's documented argument order — precision and recall are therefore
# computed with the roles swapped; confirm this is intentional.
confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedVote,levels = 0:1),positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 56 34
## 1 21 87
##
## Accuracy : 0.7222
## 95% CI : (0.6543, 0.7834)
## No Information Rate : 0.6111
## P-Value [Acc > NIR] : 0.0006932
##
## Kappa : 0.433
## Mcnemar's Test P-Value : 0.1056454
##
## Precision : 0.8056
## Recall : 0.7190
## F1 : 0.7598
## Prevalence : 0.6111
## Detection Rate : 0.4394
## Detection Prevalence : 0.5455
## Balanced Accuracy : 0.7231
##
## 'Positive' Class : 1
##
Weighted average: in this approach, different weights are applied to the predictions from multiple models before taking the average, which means giving higher or lower importance to specific model outputs. We assign the weights based on the accuracies of the models. wAverageDetermination passes each row of the required columns of the data to the wAverageRow function; wAverageRow takes each row passed to it and averages it using the weights assigned to the different algorithms.
# Pairwise agreement between the retained model predictions
# (columns 2-6, 8 and 11 of resultsDF).
importantResults <- resultsDF[c(2,3,4,5,6,8,11)]
corrMatrix <- cor(importantResults)
corrplot(corrMatrix,method = "number",title = "Correlation Between Different Models",mar = c(0,0,2,0))
corrplot(cov(importantResults),method = "number",title = "Covariance Between Different Models",mar = c(0,0,2,0))
From the above results we observe no strong correlation or covariance between most pairs of models, so considering all of them is essential, and the weights can be assigned according to the accuracies of the different models. We do observe a significant correlation between Kmeans and Kmedian, confirming the similarity of those two methods. We consider the remaining algorithms with the following weights.
# table(resultsDF)
wAverageRow <- function(row, weights = c(3, 1, 3, 3, 3, 7) / 20) {
  # Weighted mean of one row of 0/1 model predictions.
  #
  # Generalised: the model weights were hard-coded inside the function body;
  # they are now a parameter whose default reproduces the original behaviour
  # exactly (six weights summing to 1, the last model weighted 7/20).
  return(weighted.mean(row, weights))
}
wAverageDetermination <- function(resDF = resultsDF) {
  # Weighted-average ensemble score for every observation, combining the
  # predictions in columns 2, 3, 4, 8, 11 and 6 (in that order, matching the
  # default weight vector of wAverageRow). Returns one numeric score per row.
  #
  # FIX: the original loop also wrote a thresholded 0/1 vote into
  # resultsDF$PredictedVote, but only a function-local copy of the data frame
  # was modified and that column was never returned — dead code, removed.
  # The data frame is now an argument (defaulting to the global resultsDF for
  # backward compatibility) instead of being reached through global scope.
  #
  # NOTE(review): the returned score is continuous; downstream
  # factor(..., levels = 0:1) coerces every non-binary score to NA, which
  # silently shrinks the final confusion matrix — confirm that the
  # thresholded vote was not the intended return value.
  vapply(
    seq_len(nrow(resDF)),
    function(i) wAverageRow(resDF[i, ][c(2, 3, 4, 8, 11, 6)]),
    numeric(1)
  )
}
# Placeholder column, then overwrite it with the weighted-average scores.
resultsDF$PredictedWA = 1:nrow(resultsDF)
resultsDF$PredictedWA = wAverageDetermination()
# NOTE(review): PredictedWA holds continuous weighted averages, so
# factor(..., levels = 0:1) turns every value that is not exactly 0 or 1
# into NA; the confusion matrix below is therefore computed on a subset of
# the observations (67 of 198 in the rendered output) — confirm this is
# intended. The actual labels are also passed in the first (prediction)
# argument slot, the reverse of caret's documented order.
confusionMatrix(factor(actualData1$Happy,levels = 0:1),factor(resultsDF$PredictedWA,levels = 0:1),positive = '1',mode="prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 23 5
## 1 4 35
##
## Accuracy : 0.8657
## 95% CI : (0.7603, 0.9367)
## No Information Rate : 0.597
## P-Value [Acc > NIR] : 1.559e-06
##
## Kappa : 0.7225
## Mcnemar's Test P-Value : 1
##
## Precision : 0.8974
## Recall : 0.8750
## F1 : 0.8861
## Prevalence : 0.5970
## Detection Rate : 0.5224
## Detection Prevalence : 0.5821
## Balanced Accuracy : 0.8634
##
## 'Positive' Class : 1
##
# hist(resultsDF$PredictedWA)